#loading in data
patents <- read_excel("Patent_County.xlsx", skip = 1)
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim =
## shim, : Expecting numeric in A3022 / R3022C1: got 'ALL'
#gini coefficient
GC2006 <- read.csv("GC2006-2010.csv")
GC2011 <- read.csv("GC2011-2015.csv")

#per capita income
PCI2006 <- read.csv("PCI2006.csv")
PCI2011 <- read.csv("PCI2010.csv")

#Race
Race2006 <- read.csv("Race2006.csv")
Race2011 <- read.csv("Race2010.csv")

#Education by Sex
Education2006 <- read.csv("EducationSex2006.csv")
Education2011 <- read.csv("EducationSex2010.csv")

#county coordinate data
coordinates <- read_xlsx('County Coordinates.xlsx', skip = 1)
#editng the data sets
#trimming out unnecessary columns
#Gini Coefficients edits
#selecting and getting rid of columns
GC2006 <- GC2006 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:J4TM001)

GC2011 <- GC2011 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:AD4BM001)

#renaming Gini Coefficient Columns
GC2006 <- GC2006 %>%
  rename(c("J4TE001" = "GC_Estimate6", "J4TM001" = "GC_Margin_of_Error6"))

GC2011 <- GC2011 %>% 
  rename(c("AD4BE001" = "GC_Estimate11", "AD4BM001" = "GC_Margin_of_Error11"))
#per capita income edits
#selecting and getting rid of columns
PCI2006 <- PCI2006 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:JQBM001)

PCI2011 <- PCI2011 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:ADOLM001)

#renaming per capita income
PCI2006 <- PCI2006 %>%
  rename(c("JQBE001" = "PCI_Estimate6", "JQBM001" = "PCI_Margin_of_Error6"))

PCI2011 <- PCI2011 %>%
  rename(c("ADOLE001" = "PCI_Estimate11", "ADOLM001" = "PCI_Margin_of_Error11"))
#Race edits
#selecting and getting rid of columns
Race2006 <- Race2006 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:JMBM010)

Race2011 <- Race2011 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:ADKXM010)

#renaming race
Race2006 <- Race2006 %>%
  rename(c("JMBE001" = "Total_Population6", "JMBE002" = "White6", "JMBE003" = "Black6", "JMBE004" = "Native_American6", "JMBE005" = "Asian6", "JMBE006" = "Pacific6", "JMBE007" = "Some_Other6", "JMBE008" = "Two_or_More6", "JMBE009" = "Two_or_More_Some_Other6", "JMBE010" = "Three_or_More6",
           "JMBM001" = "Total_PopulationErr6", "JMBM002" = "WhiteErr6", "JMBM003" = "BlackErr6", "JMBM004" = "Native_AmericanErr6", "JMBM005" = "AsianErr6", "JMBM006" = "PacificErr6", "JMBM007" = "Some_OtherErr6", "JMBM008" = "Two_or_MoreErr6", "JMBM009" = "Two_or_More_Some_OtherErr6", "JMBM010" = "Three_or_MoreErr6"))

Race2011 <- Race2011 %>%
  rename(c("ADKXE001" = "Total_Population11", "ADKXE002" = "White11", "ADKXE003" = "Black11", "ADKXE004" = "Native_American11", "ADKXE005" = "Asian11", "ADKXE006" = "Pacific11", "ADKXE007" = "Some_Other11", "ADKXE008" = "Two_or_More11", "ADKXE009" = "Two_or_More_Some_Other11", "ADKXE010" = "Three_or_More11",
           "ADKXM001" = "Total_PopulationErr11", "ADKXM002" = "WhiteErr11", "ADKXM003" = "BlackErr11", "ADKXM004" = "Native_AmericanErr11", "ADKXM005" = "AsianErr11", "ADKXM006" = "PacificErr11", "ADKXM007" = "Some_OtherErr11", "ADKXM008" = "Two_or_MoreErr11", "ADKXM009" = "Two_or_More_Some_OtherErr11", "ADKXM010" = "Three_or_MoreErr11"))
#Education by Sex edits
#selecting and getting rid of columns
Education2006 <- Education2006 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:JN9M035)

Education2011 <- Education2011 %>%
  select(GISJOIN, YEAR, STATE:COUNTYA, NAME_E:ADMYM035)

#renaming Education
Education2006 <- Education2006 %>%
  rename(c("JN9E001" = "Total_Population6", "JN9E002" = "MalePop6", "JN9E003" = "MNoSchool6", "JN9E004" = "M4thGrade6", "JN9E005" = "M5thGrade6", "JN9E006" = "M7thGrade6", "JN9E007" = "M9thGrade6", "JN9E008" = "M10thGrade6", "JN9E009" = "M11thGrade6", "JN9E010" = "M12thGrade6", "JN9E011" = "MHighSchoolGrad6", "JN9E012" = "MSomeCollege16", "JN9E013" = "MSomeCollege1Plus6", "JN9E014" = "MAssociate6", "JN9E015" = "MBachelor6", "JN9E016" = "MMaster6", "JN9E017" = "MProfessional6", "JN9E018" = "MDoctorate6",
           "JN9E019" = "FemalePop6", "JN9E020" = "FNoSchool6", "JN9E021" = "F4thGrade6", "JN9E022" = "F5thGrade6", "JN9E023" = "F7thGrade6", "JN9E024" = "F9thGrade6", "JN9E025" = "F10thGrade6", "JN9E026" = "F11thGrade6", "JN9E027" = "F12thGrade6", "JN9E028" = "FHighSchoolGrad6", "JN9E029" = "FSomeCollege16", "JN9E030" = "FSomeCollege1Plus6", "JN9E031" = "FAssociate6", "JN9E032" = "FBachelor6", "JN9E033" = "FMaster6", "JN9E034" = "FProfessional6", "JN9E035" = "FDoctorate6"))

Education2011 <- Education2011 %>%
  rename(c("ADMYE001" = "Total_Population11", "ADMYE002" = "MalePop11", "ADMYE003" = "MNoSchool11", "ADMYE004" = "M4thGrade11", "ADMYE005" = "M5thGrade11", "ADMYE006" = "M7thGrade11", "ADMYE007" = "M9thGrade11", "ADMYE008" = "M10thGrade11", "ADMYE009" = "M11thGrade11", "ADMYE010" = "M12thGrade11", "ADMYE011" = "MHighSchoolGrad11", "ADMYE012" = "MSomeCollege111", "ADMYE013" = "MSomeCollege1Plus11", "ADMYE014" = "MAssociate11", "ADMYE015" = "MBachelor11", "ADMYE016" = "MMaster11", "ADMYE017" = "MProfessional11", "ADMYE018" = "MDoctorate11",
           "ADMYE019" = "FemalePop11", "ADMYE020" = "FNoSchool11", "ADMYE021" = "F4thGrade11", "ADMYE022" = "F5thGrade11", "ADMYE023" = "F7thGrade11", "ADMYE024" = "F9thGrade11", "ADMYE025" = "F10thGrade11", "ADMYE026" = "F11thGrade11", "ADMYE027" = "F12thGrade11", "ADMYE028" = "FHighSchoolGrad11", "ADMYE029" = "FSomeCollege111", "ADMYE030" = "FSomeCollege1Plus11", "ADMYE031" = "FAssociate11", "ADMYE032" = "FBachelor11", "ADMYE033" = "FMaster11", "ADMYE034" = "FProfessional11", "ADMYE035" = "FDoctorate11"))
#editing the coordinates data set
#renaming the columns
coordinates <- coordinates %>%
  rename(c("X__1" = "Sort", "X__2" = "State", "X__3" = "FIPS", "X__4" = "County", "X__5" = "CountySeats", "-2010" = "Population", "Longitude1" = "Longitude", "Latitude1" = "Latitude"))

#getting rid of unneeded columns 
coordinates <- coordinates %>%
  select(Sort, State, FIPS, County, Population, Longitude, Latitude)
#capitalizing the state name so I can merge by state name
Education2006[,3] = toupper(Education2006[,3])
Education2011[,3] = toupper(Education2011[,3])

GC2006[,3] = toupper(GC2006[,3])
GC2011[,3] = toupper(GC2011[,3])

PCI2006[,3] = toupper(PCI2006[,3])
PCI2011[,3] = toupper(PCI2011[,3])

Race2006[,3] = toupper(Race2006[,3])
Race2011[,3] = toupper(Race2011[,3])
#renaming a few patent variables, preparing for easier merger by changing FIPS name
patents <- patents %>%
  rename(c('Regional Area Component' = "County", 'FIPS Code' = "FIPS"))
#getting rid of parentheses in the County column
patents$CountyNew1 <-
gsub("\\s*\\([^\\)]+\\)","",as.character(patents$County))

patents$County <- patents$CountyNew1
#library(tm)

#stopwords = c("[()]", "", "city")
#stopwords = readLines('stopwords.txt')
#x = patents$County
#x = removeWords(x,stopwords)

#patents$County_new <- x

#patents <-  patents %>%
  #str_remove_all("(city)")
#first merger with coordinates and patents

pat_coord <- inner_join(patents, coordinates, by = "FIPS")
#renaming variables in set so it can merge
pat_coord <- pat_coord %>%
  rename(c("State" = "Abbreviation", 'State or Territory' = "STATE", "County.x" = "COUNTY"))

#merging patent and coordinates data set with gini coefficient

gcpcoord <- inner_join(pat_coord, GC2006, by = c("STATE", "COUNTY"))
## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector
#merging all sets
all <- inner_join(gcpcoord, GC2011, by = c("STATE", "COUNTY")) %>%
  inner_join(., Race2006, by = c("STATE", "COUNTY")) %>%
  inner_join(., Race2011, by = c("STATE", "COUNTY")) %>%
  inner_join(., PCI2006, by = c("STATE", "COUNTY")) %>%
  inner_join(., PCI2011, by = c("STATE", "COUNTY")) %>%
  inner_join(., Education2006, by = c("STATE", "COUNTY")) %>%
  inner_join(., Education2011, by = c("STATE", "COUNTY")) 
## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector

## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector

## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector

## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector

## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector

## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector

## Warning: Column `COUNTY` joining character vector and factor, coercing into
## character vector
#editing the all data set and cutting out columns
#not including all the extra mixed races and margins of error
new_all <- all %>%
  select(FIPS:Total, Abbreviation, Longitude, Latitude, GC_Estimate6, GC_Estimate11, Total_Population6.x:Pacific6, Total_Population11.x:Pacific11, PCI_Estimate6, PCI_Estimate11, MalePop6:FDoctorate6, MalePop11:FDoctorate11)
#summing columns for patents 2006-2010
new_all$patents06 <- rowSums( new_all[,11:15])

#summing columns for patents 2011-2015
new_all$patents11 <- rowSums( new_all[,16:20])

#summing columns for education
#males who dropped out before graduating high school 2006-2010
new_all$MDropOut06 <- rowSums( new_all[,42:49])

#keep high school grad, but renaming it
new_all$MHighSchoolGrad06 <- new_all$MHighSchoolGrad6

#males with some college experience (from 1 year of college to associates)
new_all$MSomeCollege06 <- rowSums( new_all[,51:53])

#renaming bachelor degree
new_all$MBachelor06 <- new_all$MBachelor6

#males with post college education(masters, professional, doctorate)
new_all$MPostCollege06 <- rowSums( new_all[,55:57])

#DOING THE SAME FOR FEMALE EDUCATION (Repeat steps from Male education)

#summing columns for education
#males who dropped out before graduating high school 2006-2010
new_all$FDropOut06 <- rowSums( new_all[,59:66])

#keep high school grad, but renaming it
new_all$FHighSchoolGrad06 <- new_all$FHighSchoolGrad6

#females with some college experience (from 1 year of college to associates)
new_all$FSomeCollege06 <- rowSums( new_all[,68:71])

#renaming bachelor degree
new_all$FBachelor06 <- new_all$FBachelor6

#females with post college education(masters, professional, doctorate)
new_all$FPostCollege06 <- rowSums( new_all[,72:74])
#Repeating the entire previous chunk but instead with the year 2011 instead of 2006

new_all$MDropOut011 <- rowSums( new_all[,76:83])

#keep high school grad, but renaming it
new_all$MHighSchoolGrad011 <- new_all$MHighSchoolGrad11

#males with some college experience (from 1 year of college to associates)
new_all$MSomeCollege011 <- rowSums( new_all[,85:87])

#renaming bachelor degree
new_all$MBachelor011 <- new_all$MBachelor11

#males with post college education(masters, professional, doctorate)
new_all$MPostCollege011 <- rowSums( new_all[,89:91])

#DOING THE SAME FOR FEMALE EDUCATION (Repeat steps from Male education)

#summing columns for education
#males who dropped out before graduating high school 2006-2010
new_all$FDropOut011 <- rowSums( new_all[,93:100])

#keep high school grad, but renaming it
new_all$FHighSchoolGrad011 <- new_all$FHighSchoolGrad11

#females with some college experience (from 1 year of college to associates)
new_all$FSomeCollege06 <- rowSums( new_all[,102:104])

#renaming bachelor degree
new_all$FBachelor011 <- new_all$FBachelor11

#females with post college education(masters, professional, doctorate)
new_all$FPostCollege011 <- rowSums( new_all[,106:108])
#getting my final data set comprised of variables I need
final_all <- new_all %>%
  select(FIPS:COUNTY, patents06, patents11, Abbreviation, Longitude, Latitude, GC_Estimate6, GC_Estimate11, Total_Population6.x:MalePop6, MDropOut06:MPostCollege06, FemalePop6, FDropOut06:FPostCollege06, MalePop11, MDropOut011:MPostCollege011, FemalePop11, FDropOut011:FPostCollege011)
#black and white race ratio, pci, bachelors and post college ratio, 
#creating per capita and ratio variables

#PATENT PER CAPITA
#patent per capita in 2006
final_all <- final_all %>%
  mutate(PatentRatio06 = patents06 / Total_Population6.x)

#patent per capita in 2011
final_all <- final_all %>%
  mutate(PatentRatio11 = patents11 / Total_Population11.x)

#RACE(WHITE RATIO 1st and BLACK RATIO 2nd)

#White
#White racial ratio in 2006
final_all <- final_all %>%
  mutate(WhiteRatio06 = White6 / Total_Population6.x)

#White racial ratio in 2011
final_all <- final_all %>%
  mutate(WhiteRatio11 = White11 / Total_Population11.x)

#BLACK
#black racial ratio in 2006
final_all <- final_all %>%
  mutate(BlackRatio06 = Black6 / Total_Population6.x)

#Black racial ratio in 2011
final_all <- final_all %>%
  mutate(BlackRatio11 = Black11 / Total_Population11.x)

#EDUCATION (BACHELORS AND POST COLLEGE)

#MALE
#Male Bachelor Ratio in 2006
final_all <- final_all %>%
  mutate(MBachRatio06 = MBachelor06 / Total_Population6.x)

#Male Bachelor Ratio in 2011
final_all <- final_all %>%
  mutate(MBachRatio11 = MBachelor011 / Total_Population11.x)

#Male Post College Ratio in 2006
final_all <- final_all %>%
  mutate(MPostRatio06 = MPostCollege06 / Total_Population6.x)

#Male Post College Ratio in 2011
final_all <- final_all %>%
  mutate(MPostRatio11 = MPostCollege011 / Total_Population11.x)

#FEMALE
#Female Bachelor Ratio in 2006
final_all <- final_all %>%
  mutate(FBachRatio06 = FBachelor06 / Total_Population6.x)

#Female Bachelor Ratio in 2011
final_all <- final_all %>%
  mutate(FBachRatio11 = FBachelor011 / Total_Population11.x)

#Female Post College Ratio in 2006
final_all <- final_all %>%
  mutate(FPostRatio06 = FPostCollege06 / Total_Population6.x)

#Female Post College Ratio in 2011
final_all <- final_all %>%
  mutate(FPostRatio11 = FPostCollege011 / Total_Population11.x)
#Final and complete data set with few variables
final <- final_all %>%
  select(FIPS:COUNTY, Abbreviation:GC_Estimate11, Total_Population6.x, Total_Population11.x, PCI_Estimate6, PCI_Estimate11, PatentRatio06:FPostRatio11)
#linear regression
#trying out log


fit <- 
  lm(PatentRatio06 ~ GC_Estimate6 + PCI_Estimate6 + WhiteRatio06 + BlackRatio06 + MBachRatio06 + FBachRatio06, data = final)

fit2 <- 
   lm(PatentRatio11 ~ GC_Estimate6 + PCI_Estimate6 + WhiteRatio06 + BlackRatio06 + MBachRatio06 + FBachRatio06, data = final)

fit3 <-
   lm(PatentRatio11 ~ GC_Estimate11 + PCI_Estimate11 + WhiteRatio11 + BlackRatio11 + MBachRatio11 + FBachRatio11, data = final)
#trying a new fit with big counties 
big <- final %>%
  filter(Total_Population11.x >= 90000)

fit4 <- 
  lm(PatentRatio06 ~ GC_Estimate6 + PCI_Estimate6 + WhiteRatio06 + BlackRatio06 + MBachRatio06 + FBachRatio06, data = big)

fit5 <- 
  lm(PatentRatio06 ~ PCI_Estimate6 + WhiteRatio06 + BlackRatio06 + MBachRatio06 + FBachRatio06, data = final)
ditch_the_axes <- theme(
  axis.text = element_blank(),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.title = element_blank()
  )

counties <- map_data("county")

#capitalizing the state of the counties data set
counties[,5] = toupper(counties[,5])

counties$subregion <- paste0(counties$subregion ," county")

#capitalizing first letter of county variable
simpleCap <- function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2),
      sep="", collapse=" ")
}

counties$subregion <- sapply(counties$subregion, simpleCap)

#renaming variables
counties <- counties %>%
  rename(c("subregion" = "COUNTY", "region" = "STATE"))

#merging counties data set and final data set
mapheat <- left_join(counties, final, by = c("COUNTY","STATE"))

#working on map

states <- map_data("state")

usmap <- ggplot(data = states, mapping = aes(x = long, y = lat, group = group)) + coord_fixed(1.3) + 
  geom_polygon(color = "black", fill = "white")

#heat maps using patent
patentmap6 <- usmap + geom_polygon(data = mapheat, aes(fill = PatentRatio06), color = "black") + geom_polygon(color = "white", fill = NA) + theme_bw() + ditch_the_axes

patentmap6 + scale_fill_gradientn(colours = rev(rainbow(2)),
                                   breaks = c(.0005, .001, .005, .01, .02),
                                   trans = "log10")
## Warning: Transformation introduced infinite values in discrete y-axis

patentmap11 <- usmap + geom_polygon(data = mapheat, aes(fill = PatentRatio11), color = "black") + geom_polygon(color = "white", fill = NA) + theme_bw() + ditch_the_axes

patentmap11 + scale_fill_gradientn(colours = rev(rainbow(2)),
                                   breaks = c(.0005, .001, .005, .01, .02),
                                   trans = "log10")
## Warning: Transformation introduced infinite values in discrete y-axis

#heat maps using gini coefficient
gc6 <- usmap + geom_polygon(data = mapheat, aes(fill = GC_Estimate6), color = "black") + geom_polygon(color = "white", fill = NA) + theme_bw() + ditch_the_axes

gc6 + scale_fill_gradientn(colours = rev(rainbow(2)),
                                   trans = "log10")

gc11 <- usmap + geom_polygon(data = mapheat, aes(fill = GC_Estimate11), color = "black") + geom_polygon(color = "white", fill = NA) + theme_bw() + ditch_the_axes

gc11 + scale_fill_gradientn(colours = rev(rainbow(2)),
                                   trans = "log10")

library(jtools)
## Warning: package 'jtools' was built under R version 3.5.2
summ(fit)
Observations 2872 (1 missing obs. deleted)
Dependent variable PatentRatio06
Type OLS linear regression
F(6,2865) 130.90
0.22
Adj. R² 0.21
Est. S.E. t val. p
(Intercept) -0.00 0.00 -0.86 0.39
GC_Estimate6 -0.00 0.00 -0.11 0.91
PCI_Estimate6 0.00 0.00 9.11 0.00
WhiteRatio06 -0.00 0.00 -4.67 0.00
BlackRatio06 -0.00 0.00 -4.16 0.00
MBachRatio06 0.01 0.00 2.12 0.03
FBachRatio06 0.01 0.00 2.73 0.01
Standard errors: OLS
#usmap <- ggplot(data = counties) +
#  geom_polygon(aes(x = long, y = lat, group = group), fill = "white", color = "black") + 
#  coord_fixed(1.3) + guides(fill=FALSE)

#hmap <- ggplot(data = mapheat) +
#  geom_polygon(aes(x = long, y = lat, group = group), fill = "white", color = "black") + 
#  coord_fixed(1.3) + guides(fill=FALSE)

#hmap <- usmap + geom_polygon(data = counties) + geom_polygon(aes(x = long, y = lat, group = group), fill = "white", color = "black")

#hmap <- usmap + geom_polygon(data = counties, fill = NA, color = "black") + geom_polygon(color = "white", fill = NA)
#fmap <- usmap + geom_point(data = final, aes(x = Longitude, y = Latitude), color = "black", size = 1, alpha = .5)